Load Packages

In [1]:
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso, Lars, Ridge, ElasticNet, LassoLars, LassoLarsCV, LinearRegression
import re
from umap import UMAP
import requests
import pandas as pd
from bs4 import BeautifulSoup
import seaborn as sns
import matplotlib.pyplot as plt
import gower
import pickle
from collections import Counter
import plotly.express as px
from xgboost import XGBRFRegressor
import shap
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler

# import the real estate price analytics library
from lib.real_estate_analytics_library import *
In [2]:
# optional - suppress warnings
import warnings
warnings.filterwarnings('ignore')

Scrape Property Price Data

In [3]:
%%capture
browser = webdriver.Chrome(ChromeDriverManager().install())
In [4]:
# the root page link is used to generate the links for all pages
root = 'https://en.comparis.ch/immobilien/result/list?requestobject=%7B%22DealType%22%3A20%2C%22SiteId%22%3A0%2C%22RootPropertyTypes%22%3A%5B%5D%2C%22PropertyTypes%22%3A%5B%5D%2C%22RoomsFrom%22%3Anull%2C%22RoomsTo%22%3Anull%2C%22FloorSearchType%22%3A0%2C%22LivingSpaceFrom%22%3Anull%2C%22LivingSpaceTo%22%3Anull%2C%22PriceFrom%22%3Anull%2C%22PriceTo%22%3Anull%2C%22ComparisPointsMin%22%3A0%2C%22AdAgeMax%22%3A0%2C%22AdAgeInHoursMax%22%3Anull%2C%22Keyword%22%3A%22%22%2C%22WithImagesOnly%22%3Anull%2C%22WithPointsOnly%22%3Anull%2C%22Radius%22%3Anull%2C%22MinAvailableDate%22%3A%221753-01-01T00%3A00%3A00%22%2C%22MinChangeDate%22%3A%221753-01-01T00%3A00%3A00%22%2C%22LocationSearchString%22%3A%22Z%C3%BCrich%22%2C%22Sort%22%3A3%2C%22HasBalcony%22%3Afalse%2C%22HasTerrace%22%3Afalse%2C%22HasFireplace%22%3Afalse%2C%22HasDishwasher%22%3Afalse%2C%22HasWashingMachine%22%3Afalse%2C%22HasLift%22%3Afalse%2C%22HasParking%22%3Afalse%2C%22PetsAllowed%22%3Afalse%2C%22MinergieCertified%22%3Afalse%2C%22WheelchairAccessible%22%3Afalse%2C%22LowerLeftLatitude%22%3Anull%2C%22LowerLeftLongitude%22%3Anull%2C%22UpperRightLatitude%22%3Anull%2C%22UpperRightLongitude%22%3Anull%7D&page='
In [5]:
# Open provided link in a browser window using the driver
# get the properties in Zürich, using the Comparis link for this result
browser.get('https://en.comparis.ch/immobilien/result/list?requestobject=%7B%22DealType%22%3A20%2C%22SiteId%22%3A0%2C%22RootPropertyTypes%22%3A%5B%5D%2C%22PropertyTypes%22%3A%5B%5D%2C%22RoomsFrom%22%3Anull%2C%22RoomsTo%22%3Anull%2C%22FloorSearchType%22%3A0%2C%22LivingSpaceFrom%22%3Anull%2C%22LivingSpaceTo%22%3Anull%2C%22PriceFrom%22%3Anull%2C%22PriceTo%22%3Anull%2C%22ComparisPointsMin%22%3A0%2C%22AdAgeMax%22%3A0%2C%22AdAgeInHoursMax%22%3Anull%2C%22Keyword%22%3A%22%22%2C%22WithImagesOnly%22%3Anull%2C%22WithPointsOnly%22%3Anull%2C%22Radius%22%3Anull%2C%22MinAvailableDate%22%3A%221753-01-01T00%3A00%3A00%22%2C%22MinChangeDate%22%3A%221753-01-01T00%3A00%3A00%22%2C%22LocationSearchString%22%3A%22Z%C3%BCrich%22%2C%22Sort%22%3A3%2C%22HasBalcony%22%3Afalse%2C%22HasTerrace%22%3Afalse%2C%22HasFireplace%22%3Afalse%2C%22HasDishwasher%22%3Afalse%2C%22HasWashingMachine%22%3Afalse%2C%22HasLift%22%3Afalse%2C%22HasParking%22%3Afalse%2C%22PetsAllowed%22%3Afalse%2C%22MinergieCertified%22%3Afalse%2C%22WheelchairAccessible%22%3Afalse%2C%22LowerLeftLatitude%22%3Anull%2C%22LowerLeftLongitude%22%3Anull%2C%22UpperRightLatitude%22%3Anull%2C%22UpperRightLongitude%22%3Anull%7D&page=0')
In [6]:
soup = BeautifulSoup(browser.page_source, 'html.parser')
In [7]:
# get the page number links
links = list(l['href'] for l in soup.find_all("a",{"class":"css-1yj1f35 excbu0j4"}))
In [8]:
# get the number of pages available for the location in question
num_pages = int(links[-2][links[-2].find('page=') + 5:]) + 1
In [9]:
# generate the list of pages that contain properties for the location in question
property_links = [root + str(i) for i in range(0, num_pages, 1)]
In [10]:
# define the root that we will comine with the property ID, giving us the page for each property
root = 'https://en.comparis.ch/immobilien/marktplatz/details/show/'
In [11]:
# define the list for storing the specific page for each property
pages = []

for property_link in property_links:    
    page = requests.get(property_link)

    soup = BeautifulSoup(page.content, 'html.parser')

    raw_id_list = re.findall(r'"AdId":[-+]?[0-9]+,', str(soup))

    id_list = [raw_id[raw_id.find(':') + 1:raw_id.find(',')] for raw_id in raw_id_list]

    # comine the root with the property ID, giving us the page for each property
    pages.extend([root + i for i in id_list])
In [12]:
# get the attributes for each property from the Comparis website
properties = []

for p in pages:
    page = requests.get(p)
    soup = BeautifulSoup(page.content, 'html.parser')
    property_address = list(soup.find("h3",{"class":"text-green"}))
    property_attributes = list(soup.find("dl",{"class":"row xsmall-up-2 medium-up-3 large-up-4 attributes-grid"}).stripped_strings)
    properties.append([property_address, property_attributes])
In [13]:
# check the length of the property attributes list
len(properties)
Out[13]:
133
In [14]:
# define the list of attributes that will be gathered from the scraped data
property_type = []
property_price = []
living_space = []
rooms = []
floor = []
available_date = []
public_transport = []
motorway = []
shop = []
In [15]:
# flatten the property address list
property_address = [record[0][0] for record in properties]
In [16]:
# cycle through the scraped property data and separate it into attribute-based lists that will be used to 
# create a pandas DataFrame

for record in properties:
    
    try:
        property_type.append(record[1][record[1].index('Property type') + 1])
    except:
        property_type.append(None)
    
    try:
        property_price.append(float(record[1][record[1].index('Purchase price') + 1][4:].replace(',','')))
    except:
        property_price.append(None)

    try:
        living_space.append(float(record[1][record[1].index('Living space') + 1][:-3]))
    except:
        living_space.append(None)
    
    try:
        rooms.append(get_num_rooms(record[1][record[1].index('Rooms') + 1]))
    except:
        rooms.append(None)
    
    try:
        floor.append(record[1][record[1].index('Floor') + 1])
    except:
        floor.append(None)
    
    try:
        available_date.append(record[1][record[1].index('Available') + 1])
    except:
        available_date.append(None)
    
    try:
        public_transport.append(float(record[1][record[1].index('Public transport stop') + 1][:-2]))
    except:
        public_transport.append(None)
    
    try:
        motorway.append(float(record[1][record[1].index('Motorway') + 1][:-2]))
    except:
        motorway.append(None)
    
    try:
        shop.append(float(record[1][record[1].index('Shops') + 1][:-2]))
    except:
        shop.append(None)
In [17]:
# create a pandas DataFrame that contains the raw attributes that we have gathered
property_records = pd.DataFrame(list(zip(property_address, property_type, property_price, living_space, rooms, floor, available_date, public_transport, motorway, shop)), columns =['property_address', 'property_type', 'property_price', 'living_space', 'rooms', 'floor', 'available_date', 'public_transport', 'motorway', 'shop'])
In [18]:
# show DataFrame
property_records
Out[18]:
property_address property_type property_price living_space rooms floor available_date public_transport motorway shop
0 Brandschenkestrasse, 8002 Zürich Apartment NaN NaN 2.5 None None NaN NaN NaN
1 8004 Zürich Apartment NaN 98.0 3.5 None None NaN NaN NaN
2 8046 Zürich Apartment 1385000.0 118.0 3.5 None None NaN NaN 500.0
3 Zürich, 8052 Zürich Semi-detached house 1290000.0 135.0 5.5 None Immediately NaN 915.0 385.0
4 Tuchmacherstrasse 76, 8041 Zürich Apartment 1590000.0 118.0 4.5 1. floor None NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ...
128 8005 Zürich Apartment 4850000.0 NaN 5.5 19. floor By arrangement NaN NaN NaN
129 Zanggerweg 9, 8006 Zürich Terraced/row house 3150000.0 170.0 4.5 None None NaN NaN NaN
130 8002 Zürich Single-family house 13500000.0 230.0 NaN None None NaN NaN NaN
131 Meisenrain 19, 8044 Gockhausen Single-family house 1850000.0 200.0 6.5 None None NaN NaN NaN
132 Leutschenbachstrasse 30, 8050 Zürich Attic apartment 2050000.0 136.0 3.5 19. floor None 300.0 3000.0 1000.0

133 rows × 10 columns

In [19]:
# save the scraped property records
property_records.to_csv('data/property_records_purchase.csv')

Process Data

In this section, we process the scraped web data. This involves encoding all features as the appropriate data type and performing imputation (i.e. encoding missing data points as the mean, median or mode of the existing data.

In [20]:
# load data
property_records = pd.read_csv('data/property_records_purchase.csv')
In [21]:
# display the ratio of missing values for the below features

print('living_space:', property_records.loc[property_records['living_space'].isna() == True].shape[0]/property_records.shape[0])
print('rooms:', property_records.loc[property_records['rooms'].isna() == True].shape[0]/property_records.shape[0])
print('property_address:', property_records.loc[property_records['property_address'].isna() == True].shape[0]/property_records.shape[0])
print('floor:', property_records.loc[property_records['floor'].isna() == True].shape[0]/property_records.shape[0])
print('property_type:', property_records.loc[property_records['property_type'].isna() == True].shape[0]/property_records.shape[0])
print('shop:', property_records.loc[property_records['shop'].isna() == True].shape[0]/property_records.shape[0])
print('public_transport:', property_records.loc[property_records['public_transport'].isna() == True].shape[0]/property_records.shape[0])
print('motorway:', property_records.loc[property_records['motorway'].isna() == True].shape[0]/property_records.shape[0])
living_space: 0.3157894736842105
rooms: 0.24812030075187969
property_address: 0.0
floor: 0.5263157894736842
property_type: 0.0
shop: 0.7218045112781954
public_transport: 0.7443609022556391
motorway: 0.7969924812030075
In [22]:
# process the data for use in a price prediction model, pricing analytics
property_records = process_records(property_records)
In [23]:
property_records
Out[23]:
property_address property_type property_price living_space rooms floor available_date public_transport motorway shop ... Loft Maisonette Multi-family house Other Penthouse Semi-detached house Single-family house Terraced/row house Underground garage Villa
0 Brandschenkestrasse, 8002 Zürich Apartment 5555029.38 164.32967 2.5 1. floor NaN 201.794118 1982.888889 399.567568 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 8004 Zürich Apartment 5555029.38 98.00000 3.5 1. floor NaN 201.794118 1982.888889 399.567568 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 8046 Zürich Apartment 1385000.00 118.00000 3.5 1. floor NaN 201.794118 1982.888889 500.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 Zürich, 8052 Zürich Semi-detached house 1290000.00 135.00000 5.5 1. floor Immediately 201.794118 915.000000 385.000000 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
4 Tuchmacherstrasse 76, 8041 Zürich Apartment 1590000.00 118.00000 4.5 1. floor NaN 201.794118 1982.888889 399.567568 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
128 8005 Zürich Apartment 4850000.00 164.32967 5.5 19. floor By arrangement 201.794118 1982.888889 399.567568 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
129 Zanggerweg 9, 8006 Zürich Terraced/row house 3150000.00 170.00000 4.5 1. floor NaN 201.794118 1982.888889 399.567568 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
130 8002 Zürich Single-family house 13500000.00 230.00000 4.5 1. floor NaN 201.794118 1982.888889 399.567568 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
131 Meisenrain 19, 8044 Gockhausen Single-family house 1850000.00 200.00000 6.5 1. floor NaN 201.794118 1982.888889 399.567568 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
132 Leutschenbachstrasse 30, 8050 Zürich Attic apartment 2050000.00 136.00000 3.5 19. floor NaN 300.000000 3000.000000 1000.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

133 rows × 61 columns

In [24]:
# save the processed property records
property_records.to_csv('data/processed_property_records_purchase.csv')
In [25]:
# save the possible values for each feature
with open('data/possible_postcodes_purchase.pickle', 'wb') as handle:
    pickle.dump(list(property_records['property_postcode'].unique()), handle)

with open('data/possible_floors_purchase.pickle', 'wb') as handle:
    pickle.dump(list(property_records['floor'].unique()), handle)

with open('data/possible_types_purchase.pickle', 'wb') as handle:
    pickle.dump(list(property_records['property_type'].unique()), handle)

Model Selection and Training

In this section we will select, train and save two models - one tree-based model, and one linear regression-based model. The tree-based model will be selected because it has a lower mean absolute error, while the linear regression-based model will be used to extrapolate the price of real estate that falls outside of the range of the training data (i.e. very high-value real estate), since tree-based models cannot predict values that are higher than the highest target value in the dataset on which they are trained.

Note: the linear model assumes that there is a linear relationship between price and other features such as living space and number of rooms for larger properties outside of the dataset.

The methodology used in this Jupyter notebook assumes stability in the price data for the records that were scraped - that is, we assume that the prices did not significantly change over the time period covered by the property listings.

In [26]:
# load data
property_records = pd.read_csv('data/processed_property_records_purchase.csv')
In [27]:
x = property_records[[col for col in property_records.columns if col not in ['property_price', 'Unnamed: 0', 'property_address', 'available_date', 'property_type', 'floor', 'property_postcode']]]
y = property_records['property_price']
In [28]:
fig = px.scatter(property_records, x="rooms", y="property_price", color="property_type", title="Purchase Price vs Number of Rooms", hover_data=['property_postcode'])
fig.show()
In [29]:
fig = px.box(property_records, x="rooms", y="property_price", title="Purchase Price vs Number of Rooms", points=False)
fig.show()
In [30]:
fig = px.scatter(property_records, x="living_space", y="property_price", color="property_type", title="Purchase Price vs Living Space", hover_data=['property_postcode'])
fig.show()
In [31]:
fig = px.box(property_records, x="property_postcode", y="property_price", title="Purchase Price vs Living Space", points=False)
fig.update_xaxes(type='category')
fig.show()
In [32]:
fig = px.box(property_records, x="floor", y="property_price", title="Purchase Price vs Living Space", points=False)
fig.update_xaxes(type='category')
fig.show()
In [33]:
fig = px.box(property_records, x="property_type", y="property_price", title="Purchase Price vs Living Space", points=False)
fig.update_xaxes(type='category')
fig.show()
In [34]:
# scale the float features
columns = ['living_space', 'rooms', 'public_transport', 'motorway', 'shop']
scaler = StandardScaler().fit(x[columns])
scaled = scaler.transform(x[columns])
scaled = pd.DataFrame(scaled, columns=['scaled_' + column for column in columns])
x = pd.concat([x, scaled], axis=1)
In [35]:
# save the scaler model for later use
with open('data/scaler_purchase.pickle', 'wb') as handle:
    pickle.dump(scaler, handle)
In [36]:
x = x.drop(columns=['living_space', 'rooms', 'public_transport', 'motorway', 'shop'])
In [37]:
# calculate the correlation_matrix matrix of the features and the dependent variable
correlation_matrix = property_records[[col for col in property_records.columns if col not in ['Unnamed: 0', 'property_address', 'available_date', 'property_type', 'floor', 'property_postcode']]].corr().loc[['property_price']].drop(['property_price'], axis=1)
In [38]:
# visualize the correlation_matrix matrix
fig, ax = plt.subplots(figsize=(15,10))
sns.heatmap(correlation_matrix, square=True, vmin=-1, vmax=1, ax=ax, linewidths=1, xticklabels=correlation_matrix.columns, cmap="Blues")
plt.yticks(rotation=0)
plt.show()
In [39]:
get_vifs(x)
Out[39]:
[('8000', inf), ('8001', inf), ('8002', inf), ('8003', inf), ('8004', inf), ('8005', inf), ('8006', inf), ('8008', inf), ('8021', inf), ('8032', inf), ('8038', inf), ('8041', inf), ('8044', inf), ('8046', inf), ('8047', inf), ('8048', inf), ('8049', inf), ('8050', inf), ('8051', inf), ('8052', inf), ('8053', inf), ('8057', inf), ('1. floor', inf), ('15. floor', inf), ('16. floor', inf), ('19. floor', inf), ('2. floor', inf), ('20. floor', inf), ('22. floor', inf), ('3. floor', inf), ('4. floor', inf), ('5. floor', inf), ('6. floor', inf), ('Basement', inf), ('Ground floor', inf), ('Apartment', inf), ('Attic apartment', inf), ('Building land', inf), ('Commercial land', inf), ('Commercial property', inf), ('Loft', inf), ('Maisonette', inf), ('Multi-family house', inf), ('Other', inf), ('Penthouse', inf), ('Semi-detached house', inf), ('Single-family house', inf), ('Terraced/row house', inf), ('Underground garage', inf), ('Villa', inf), ('scaled_motorway', 3.485223675478757), ('scaled_public_transport', 2.902213932330284), ('scaled_shop', 2.452805934000675), ('scaled_living_space', 2.023947152375155), ('scaled_rooms', 1.919405452350377)]

The above VIFs indicate, as expected, serious multicolinearty in the data. This is because of the the one hot encoding of the categorical data. In order to fix this problem, we can eliminate a column from each of the categorical feature sets. We will select the columns below, based on their frequency in the data. This should not result in any significant loss in the performance of the model, as the removed values will still be indicated in the data (because all of the remaining columns/features will be 0 if the removed value is present). For example, if we remove the 'Apartment' encoding, then any record for an apartment will have all other property_type encoding set to 0 (e.g features such as 'Single garage' will all be equal to 0).

In [40]:
sorted(Counter(property_records['property_postcode']).items(), key=lambda v: v[1], reverse=True)
Out[40]:
[(8052, 16), (8002, 10), (8044, 10), (8001, 10), (8046, 9), (8050, 9), (8049, 9), (8005, 8), (8053, 7), (8000, 7), (8048, 6), (8006, 6), (8041, 5), (8038, 5), (8032, 4), (8008, 3), (8003, 3), (8051, 2), (8004, 1), (8057, 1), (8021, 1), (8047, 1)]
In [41]:
sorted(Counter(property_records['floor']).items(), key=lambda v: v[1], reverse=True)
Out[41]:
[('1. floor', 90), ('Ground floor', 19), ('3. floor', 5), ('2. floor', 4), ('4. floor', 3), ('19. floor', 3), ('16. floor', 3), ('15. floor', 1), ('5. floor', 1), ('6. floor', 1), ('Basement', 1), ('20. floor', 1), ('22. floor', 1)]
In [42]:
sorted(Counter(property_records['property_type']).items(), key=lambda v: v[1], reverse=True)
Out[42]:
[('Apartment', 57), ('Commercial property', 23), ('Single-family house', 13), ('Attic apartment', 8), ('Multi-family house', 7), ('Maisonette', 7), ('Other', 5), ('Villa', 4), ('Loft', 3), ('Semi-detached house', 1), ('Underground garage', 1), ('Building land', 1), ('Commercial land', 1), ('Penthouse', 1), ('Terraced/row house', 1)]
In [43]:
# define the columns that are to be eliminated from the input features to the Linear Regression model. This is to 
# eliminated multicolinearity.
eliminated_columns = ['8052', '1. floor', 'Apartment']
In [44]:
# The below VIFs for the reduced data indicate no multicolinearity.
get_vifs(x.drop(columns=eliminated_columns))
Out[44]:
[('8051', 3.6774453865982646), ('scaled_motorway', 3.485206034785232), ('Basement', 3.306974457897832), ('scaled_public_transport', 2.901026384458236), ('3. floor', 2.889610831059518), ('Maisonette', 2.7888279017695106), ('Single-family house', 2.5107284006354345), ('Commercial property', 2.4751986686952594), ('scaled_shop', 2.4033868182648903), ('8057', 2.3549440422134356), ('8005', 2.3088346484646673), ('8000', 2.0841023982521545), ('4. floor', 2.0738124134140175), ('Loft', 2.0733485291112568), ('8001', 2.0231457457048503), ('8050', 2.0118194341449773), ('8048', 1.9872219755524887), ('scaled_living_space', 1.982514594196613), ('Multi-family house', 1.9716280192158608), ('scaled_rooms', 1.9145770196580227), ('Villa', 1.879072168589809), ('8046', 1.8206896902269918), ('Penthouse', 1.7731841289601185), ('Ground floor', 1.7105778444544075), ('8044', 1.6608894660270588), ('Other', 1.6562786060584274), ('19. floor', 1.6393235403275979), ('8038', 1.634470949567253), ('8006', 1.5609199180914555), ('8032', 1.538732747248442), ('Attic apartment', 1.500467531876949), ('16. floor', 1.4943215282886915), ('Underground garage', 1.4670200636648436), ('8008', 1.4488193191032661), ('8003', 1.434583061808449), ('8049', 1.4312854842915237), ('8002', 1.3829915466386857), ('2. floor', 1.3652762563319154), ('Building land', 1.3304987296598507), ('20. floor', 1.3116765206798187), ('15. floor', 1.3042071711591947), ('22. floor', 1.288004942959726), ('Terraced/row house', 1.2579682545138973), ('8041', 1.2362446964667644), ('5. floor', 1.2322719658702772), ('6. floor', 1.2031824340741775), ('Commercial land', 1.2015513861504008), ('8021', 1.1986916997045667), ('8053', 1.109575592845646), ('Semi-detached house', 1.0895515839105117), ('8004', 1.006784930094682), ('8047', 1.0017895041717724)]
In [45]:
# save the list of eliminated columns for later use
with open('data/eliminated_columns_purchase.pickle', 'wb') as handle:
    pickle.dump(eliminated_columns, handle)
In [46]:
# remove the outliers detected by Tukey's test - this reduced dataset will be used in the training of the linear 
# models
xe, ye = remove_outliers_tukeys_test(x.drop(columns=eliminated_columns), y)
In [47]:
# use the Gower distance to scale the data for inout into UMAP dimensionality-reduction, which takes into account
# the float inputs and their interaction with the one hot-encoded data
umap_results = UMAP(n_neighbors=20).fit_transform(gower.gower_matrix(pd.concat([y, x], axis=1)))
In [48]:
outlier_indices = get_outliers_isolation_forest(x, y, n_estimators=100, contamination=0.06)
normal_indices = [i for i in range(0, x.shape[0], 1) if i not in outlier_indices]
In [49]:
outliers = pd.DataFrame(zip([v[0] for v in umap_results[outlier_indices]], [v[1] for v in umap_results[outlier_indices]], ['Outlier' for i in range(0, len(outlier_indices), 1)]), columns=['Dimension 1', 'Dimension 2', 'Status'])
In [50]:
normal = pd.DataFrame(zip([v[0] for v in umap_results[normal_indices]], [v[1] for v in umap_results[normal_indices]], ['Normal' for i in range(0, len(outlier_indices), 1)]), columns=['Dimension 1', 'Dimension 2', 'Status'])
In [51]:
# save the UMAP results as a pandas DataFrame
umap_data = pd.concat([normal, outliers]).reset_index(drop=True)
In [52]:
# plot the UMAP results, showing the outliers vs normal data points, based on the isolation forest model
fig = px.scatter(umap_data, x="Dimension 1", y="Dimension 2", color="Status", title="UMAP Result", hover_data=[umap_data.index.values])
fig.show()
In [53]:
# remove the outliers detected by the isolation forest - this reduced dataset will be used in the training of the 
# tree-based models
xt, yt = remove_outliers_isolation_forest(x, y, n_estimators=100, contamination=0.06)
In [54]:
model_types = [['Lasso', Lasso()], ['Ridge', Ridge()], ['ElasticNet', ElasticNet()], ['LassoLars', LassoLars()], ['LassoLarsCV', LassoLarsCV()], ['Lars', Lars()], ['LinearRegression', LinearRegression()]]
In [55]:
model_results = train_model(xe, ye, model_types, 5)
In [56]:
# get the top 5 results, selected based on the mae metric
top_models = sorted(model_results, key=lambda v: v[4], reverse=False)[:5]
In [57]:
top_models
Out[57]:
[['LinearRegression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False), 1.2762702075140752e+17, 2.954212070841129e+16, 841518.3287654569], ['LassoLars', LassoLars(alpha=0.016177769541090074, copy_X=True, eps=2.220446049250313e-16, fit_intercept=True, fit_path=True, max_iter=500, normalize=True, positive=False, precompute='auto', verbose=False), 4384988.487739325, 2103438.033638439, 848007.7967656173], ['Ridge', Ridge(alpha=0.09136044077672557, copy_X=True, fit_intercept=True, max_iter=None, normalize=False, random_state=None, solver='auto', tol=0.001), 4230191.242065761, 2050377.8305851382, 911827.0880439288], ['ElasticNet', ElasticNet(alpha=0.0004326599104942376, copy_X=True, fit_intercept=True, l1_ratio=0.5, max_iter=1000, normalize=False, positive=False, precompute=False, random_state=None, selection='cyclic', tol=0.0001, warm_start=False), 4096662.5248047384, 1975027.1535115722, 918616.7495854333], ['LassoLarsCV', LassoLarsCV(copy_X=True, cv=None, eps=2.220446049250313e-16, fit_intercept=True, max_iter=500, max_n_alphas=1000, n_jobs=None, normalize=True, positive=False, precompute='auto', verbose=False), 4037027.980526985, 2032690.7790741096, 995280.0998515591]]
In [58]:
# train the best model on the expanded dataset
linear_pricing_model = model_results[0][1].fit(xe, ye)
In [59]:
linear_pricing_model
Out[59]:
Lasso(alpha=0.016300369859997724, copy_X=True, fit_intercept=True,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
In [60]:
# save the selected model
with open('models/linear_pricing_model_purchase.pickle', 'wb') as handle:
    pickle.dump(linear_pricing_model, handle)
In [61]:
# calculate feature importances based on the regression coefficients
regression_interpretation = pd.DataFrame(sorted(list(zip(xe.columns, linear_pricing_model.coef_)), key=lambda v: abs(v[1]), reverse=False), columns=['Feature', 'Weight'])
In [62]:
# plot the regression corefficient-based feature importances
fig = px.scatter(regression_interpretation, x="Weight", y="Feature")
fig.update_yaxes(type='category')
fig.show()
In [63]:
model_types = [['XGBRFRegressor', XGBRFRegressor()], ['AdaBoostRegressor', AdaBoostRegressor()], ['RandomForestRegressor', RandomForestRegressor()], ['ExtraTreesRegressor', ExtraTreesRegressor()], ['DecisionTreeRegressor', DecisionTreeRegressor()], ['GradientBoostingRegressor', GradientBoostingRegressor()]]
In [64]:
model_results = train_model(xt, yt, model_types, 3)
In [65]:
# get the top 5 results, selected based on the mae metric
top_models = sorted(model_results, key=lambda v: v[4], reverse=False)[:5]
In [66]:
top_models
Out[66]:
[['ExtraTreesRegressor', ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse', max_depth=18, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=19, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False), 14121319.5724222, 4707514.93939634, 796930.0658691813], ['DecisionTreeRegressor', DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=18, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort='deprecated', random_state=None, splitter='best'), 16221116.560738131, 5062134.707590011, 935828.4366666666], ['XGBRFRegressor', XGBRFRegressor(base_score=0.5, booster=None, colsample_bylevel=1, colsample_bynode=0.8, colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints=None, learning_rate=1, max_delta_step=0, max_depth=13, min_child_weight=1, missing=nan, monotone_constraints=None, n_estimators=39, n_jobs=0, num_parallel_tree=39, objective='reg:squarederror', random_state=0, reg_alpha=0, reg_lambda=1e-05, scale_pos_weight=1, subsample=0.8, tree_method=None, validate_parameters=False, verbosity=None), 18363637.18349646, 5988331.630039197, 1184653.5633333332], ['RandomForestRegressor', RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse', max_depth=15, max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=13, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False), 14672688.66694996, 5601275.099461998, 1318449.1947033687], ['GradientBoostingRegressor', GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.1, loss='ls', max_depth=7, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=27, n_iter_no_change=None, presort='deprecated', random_state=None, subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False), 13648329.587012762, 4762197.95579793, 1328034.043243566]]
In [67]:
# train the best model on the expanded dataset
pricing_model = top_models[0][1].fit(xt, yt)
In [68]:
pricing_model
Out[68]:
ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=18, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=19, n_jobs=None, oob_score=False,
                    random_state=None, verbose=0, warm_start=False)
In [69]:
# save the selected model
with open('models/pricing_model_purchase.pickle', 'wb') as handle:
    pickle.dump(pricing_model, handle)
In [70]:
# calculate and show the raw SHAP values for the model
# reference: https://christophm.github.io/interpretable-ml-book/shap.html

# load JS visualization code to notebook
shap.initjs()

explainer = shap.TreeExplainer(pricing_model)
shap_values = explainer.shap_values(xt)

shap.summary_plot(shap_values, xt)
In [71]:
# show the SHAP value-based relative model feature importances
shap.summary_plot(shap_values, xt, plot_type="bar")

Predict the price of any given property

In [72]:
# show the possible values for each feature
with open('data/possible_postcodes_purchase.pickle', 'rb') as handle:
    print('Possible Postcodes =', pickle.load(handle))

print('')

with open('data/possible_floors_purchase.pickle', 'rb') as handle:
    print('Possible Floors =', pickle.load(handle))

print('')

with open('data/possible_types_purchase.pickle', 'rb') as handle:
    print('Possible Property Types =', pickle.load(handle))
Possible Postcodes = ['8002', '8004', '8046', '8052', '8041', '8050', '8053', '8005', '8044', '8048', '8001', '8008', '8051', '8049', '8006', '8000', '8038', '8032', '8057', '8003', '8021', '8047']

Possible Floors = ['1. floor', '15. floor', 'Ground floor', '4. floor', '5. floor', '2. floor', '19. floor', '16. floor', '6. floor', '3. floor', 'Basement', '20. floor', '22. floor']

Possible Property Types = ['Apartment', 'Semi-detached house', 'Commercial property', 'Single-family house', 'Multi-family house', 'Underground garage', 'Other', 'Maisonette', 'Villa', 'Building land', 'Loft', 'Attic apartment', 'Commercial land', 'Penthouse', 'Terraced/row house']
In [73]:
# load data
property_records = pd.read_csv('data/processed_property_records_purchase.csv')
In [74]:
# load the pre-trained models and other required data from pickle files
with open('models/pricing_model_purchase.pickle', 'rb') as handle:
    pricing_model = pickle.load(handle)

with open('models/linear_pricing_model_purchase.pickle', 'rb') as handle:
    linear_pricing_model = pickle.load(handle)
    
with open('data/eliminated_columns_purchase.pickle', 'rb') as handle:
    eliminated_columns = pickle.load(handle)

with open('data/scaler_purchase.pickle', 'rb') as handle:
    scaler = pickle.load(handle)
    
with open('data/encoder_purchase.pickle', 'rb') as handle:
    encoder = pickle.load(handle)
In [75]:
# define the feature values for the property
living_space = 140
rooms = 6
postcode = '8001'
floor = '1. floor'
property_type = 'Apartment'
public_transport = 100
motorway = 100
shop = 100
In [76]:
input_values = encode_input(living_space, rooms, postcode, floor, property_type, public_transport, motorway, shop, scaler, encoder)
In [77]:
input_values
Out[77]:
living_space rooms public_transport motorway shop property_postcode floor property_type 8000 8001 ... Semi-detached house Single-family house Terraced/row house Underground garage Villa scaled_living_space scaled_rooms scaled_public_transport scaled_motorway scaled_shop
0 140 6 100 100 100 8001 1. floor Apartment 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 -0.21447 0.130106 -1.642217 -3.237934 -2.056436

1 rows × 63 columns

In [78]:
# use one of: [regression_model, tree_model]
model_type = 'tree_model'
In [79]:
# calculate price
if model_type == 'regression_model':
    price = linear_pricing_model.predict(input_values.drop(columns=['living_space', 'rooms', 'public_transport', 'motorway', 'shop', 'property_postcode', 'floor', 'property_type'] + eliminated_columns))[0]
else:
    price = pricing_model.predict(input_values.drop(columns=['living_space', 'rooms', 'public_transport', 'motorway', 'shop', 'property_postcode', 'floor', 'property_type']))[0]
    
calculated_price = pd.concat([pd.DataFrame([price], columns=['property_price']), input_values], axis=1)
In [80]:
calculated_price
Out[80]:
property_price living_space rooms public_transport motorway shop property_postcode floor property_type 8000 ... Semi-detached house Single-family house Terraced/row house Underground garage Villa scaled_living_space scaled_rooms scaled_public_transport scaled_motorway scaled_shop
0 5.391077e+06 140 6 100 100 100 8001 1. floor Apartment 0.0 ... 0.0 0.0 0.0 0.0 0.0 -0.21447 0.130106 -1.642217 -3.237934 -2.056436

1 rows × 64 columns

In [81]:
# the predicted price of the property is shown as a red cross, and is plotted alongside properties that are in 
# it's peer group (i.e. properties that have the same number of rooms and the same property type)
fig = px.scatter(property_records[(property_records['rooms'] == rooms) & (property_records['property_type'] == property_type)], x="living_space", y="property_price", color="property_type", hover_data=['living_space'])
fig1 = px.scatter(calculated_price, x="living_space", y="property_price", title="Calculated Price vs Peer Group", hover_data=['property_postcode'])
fig1.update_traces(marker=dict(size=10, color='Red', symbol='x'))
fig.add_trace(fig1.data[0])
fig.show()
In [ ]: